;include 'labs.inc' line by profiling purposes
make_matrixx:
; Builds a 3x3 rotation matrix from three byte-sized rotation parameters.
; in:     al, bl, dl - rot params; only the low byte of eax, ebx, edx is
;                      used, each one is turned into an angle in radians
;                      as  value * pi / 127
;         edi        - ptr to the 36-byte destination (handed on unchanged
;                      to make_rotation_matrix)
; out:    [edi]      - desired matrix, 9 floats
; changes: eax, ebx, edx (left zero-extended), xmm0 - xmm7
    push        ebp
    mov         ebp,esp
    sub         esp,32          ; 32 bytes: the three pushes below then land
                                ; exactly on .angles ([ebp-44]), so nothing
                                ; is ever stored below esp
    .sincosx    equ [ebp-8]
    .sincosy    equ [ebp-16]
    .sincosz    equ [ebp-24]
    .pi         equ [ebp-28]
    .sincos     equ [ebp-28]
    .angles     equ [ebp-44]
    .angles2    equ [ebp-40]
    .angles3    equ [ebp-36]
    mov         dword .pi,3.1415926
    movzx       edx,dl ;0xff
    movzx       ebx,bl ;0xff
    movzx       eax,al ;0xff
    push        eax ebx edx     ; [esp] = edx, [esp+4] = ebx, [esp+8] = eax
    pcmpeqd     xmm3,xmm3
    movups      xmm0,[esp]      ; lanes 0..2 = edx, ebx, eax; lane 3 is
                                ; uninitialised padding and never used
    psrld       xmm3,25         ; 0xFFFFFFFF shr 25 = 127 in every lane
    movlps      xmm1,.pi
    cvtdq2ps    xmm0,xmm0
    cvtdq2ps    xmm3,xmm3
    shufps      xmm1,xmm1,0     ; broadcast pi
    mulps       xmm0,xmm1
    divps       xmm0,xmm3       ; angle = value * pi / 127
    movups      .angles,xmm0    ; overwrites the pushed integers in place
    call        sin_cos         ; lane 0 still holds the dl angle
    movlps      .sincosx,xmm0   ; sin, cos of the dl angle
    movlps      xmm0,.angles2
    call        sin_cos
    movlps      .sincosy,xmm0   ; sin, cos of the bl angle
    movlps      xmm0,.angles3
    call        sin_cos         ; sin, cos of the al angle in lanes 0, 1
    movhps      xmm0,.sincosy
    movlps      xmm1,.sincosx
    ; xmm1 - l->h  sin, cos of the dl angle
    ; xmm0 - l->h  sin, cos of the al angle, sin, cos of the bl angle
    call        make_rotation_matrix
  ; edi -  desired matrix
    leave
    ret                         ; without this ret execution fell through
                                ; into sin_cos and ran it one extra time
;=======================================================
sin_cos:
; Calculating trigonometric functions
; Taylor series implementation.  The loop below adds one term per pass for
; n = 3, 5, ..., 99 (49 passes).  The 1/(n*(n-1)) factors come from rcpps,
; which is an approximate reciprocal.
;  in:
;    xmm0 - 2 angles floats (radians); lowest half  x, y
;  out:
;    xmm0 - sinx cosx atanx atany 4 dwords float
;  changes: xmm1 - xmm7; general purpose registers are saved and restored
;           with pushad/popad
;  NOTE(review): only lanes 0 and 1 follow the series written below.
;    In lanes 2 and 3 the factors 1/n are multiplied up from pass to pass
;    (x^3/3, x^5/(3*5), ...), and lane 3 is multiplied by x^2 rather than
;    y^2, so these lanes do not match the atan series - confirm that no
;    caller relies on them.
      ; sin  x = x - x^3/3! + x^5/5! -  ...
      ; cos  x = 1 - x^2/2! + x^4/4! -  ...
      ; atan x = x - x^3/3  + x^5/5  -  ...
      pushad
      movaps    xmm7,xmm0
      pcmpeqd   xmm5,xmm5           ; all ones, turned into the sign mask below
      movhps    xmm7,[the_one]      ; the_one is defined elsewhere; presumably 1.0
      shufps    xmm7,xmm7,01001000b ; xm7 = lo -> hi  x, 1.0, x, y
      xorps     xmm3,xmm3           ; current sign mask, starts as "no flip"
      mov       ecx,3               ; n, the highest factor of the current term
      mulps     xmm0,xmm0           ; x^2
      pslld     xmm5,31             ; 0x80000000 in every lane (float sign bit)
      movaps    xmm1,xmm7           ; xm1 = running term, xm7 = running sum
      shufps    xmm0,xmm0,0
      ; xm7 lo - hi x,   1.0, x,   y
      ; xm1 lo - hi x,   1.0, x,   y
      ; xm0 lo - hi x^2, x^2, x^2, x^2
      mov       edx,1               ; becomes the constant 1.0 lane of xm2
     @@:
      mulps     xmm1,xmm0           ; lo - hi x^3  x^2
      mov       ebx,ecx
      mov       eax,ecx
      dec       ebx                 ; n-1
      sub       eax,2               ; n-2
      push      edx eax ebx ecx     ; [esp] = n, n-1, n-2, 1
      movups    xmm2,[esp]
      cvtdq2ps  xmm2,xmm2           ; lo -> hi  n, n-1, n-2, 1.0
      movaps    xmm4,xmm2
      add       esp,16
      shufps    xmm4,xmm4,00001001b ; n-1, n-2, n, n
      shufps    xmm2,xmm2,11110100b ; n,   n-1, 1, 1
      mulps     xmm4,xmm2           ; lo -> hi = n*(n-1), (n-1)*(n-2), n, n
      rcpps     xmm4,xmm4           ; approximate reciprocals
      mulps     xmm1,xmm4           ; next term: x^n/n!   |   x^(n-1)/(n-1)!
      movaps    xmm6,xmm1
      xorps     xmm6,xmm3           ; flip the sign on every second pass
      subps     xmm7,xmm6           ; sum = sum - (+-term): alternating series
      xorps     xmm3,xmm5 ;[sign_mask]
      add       ecx,2
      cmp       ecx,100
      jb        @b
      movaps    xmm0,xmm7
 ;    xmm0 - sinx cosx atanx atany  4 dwords float
      popad
ret
;=======================================================
; Builds a 3x3 rotation matrix from three sine/cosine pairs held in
; registers and stores it as 9 packed floats at [edi].
; Naming used in this routine (it follows the inline comments below):
;   x = xmm0 lanes 0,1 (sin, cos)
;   y = xmm0 lanes 2,3 (sin, cos)
;   z = xmm1 lanes 0,1 (sin, cos); xmm1 lanes 2,3 never reach the output
; in:  xmm0 - l->h  sx, cx, sy, cy
;      xmm1 - l->h  sz, cz
;      edi  - ptr to 36-byte destination
; out: [edi] - floats in increasing address order, three per line:
;        cy*cz             cy*sz             -sy
;        sx*sy*cz - cx*sz  sx*sy*sz + cx*cz   sx*cy
;        cx*sy*cz + sx*sz  cx*sy*sz - sx*cz   cx*cy
; changes: xmm0 - xmm7
make_rotation_matrix:
    ; xm1 - l->h  sz, cz
    ; xm0 - l->h  sx, cx, sy, cy
     movaps    xmm3,xmm1 ; sz cz
     movaps    xmm7,xmm0
     movaps    xmm6,xmm0
     movaps    xmm4,xmm0
     shufps    xmm6,xmm6,11110101b    ;   cx cx cy cy
     movhlps   xmm5,xmm7              ;   low half = sy cy
     shufps    xmm4,xmm4,11111111b    ;   cy in every lane
     movaps    xmm2,xmm3
     shufps    xmm2,xmm2,01010001b    ; cz sz cz cz
     shufps    xmm7,xmm7,0            ; sx in every lane
     shufps    xmm5,xmm5,0            ; sy in every lane
     movaps    xmm0,xmm4 ;.cy cy
     mulps     xmm0,xmm2 ; cy*cz  cy*sz
     xorps     xmm1,xmm1
     subps     xmm1,xmm5 ; -sy
     movlhps   xmm0,xmm1 ; cy*cz  cy*sz  -sy  -sy
     movups    [edi],xmm0 ; elements 0..2 (element 3 is rewritten below)
     movaps    xmm0,xmm7 ;dword .sinx
     mulps     xmm0,xmm2 ;.cosz
     mulps     xmm0,xmm5 ;dword .siny   -> sx*sy*cz  sx*sy*sz
     movaps    xmm1,xmm6 ;.cosx
     mulps     xmm1,xmm3 ;.sinz         -> cx*sz     cx*cz
     addsubps  xmm0,xmm1 ; lane 0 subtracts, lane 1 adds
     movlps    [edi+12],xmm0 ; elements 3, 4
     movaps    xmm1,xmm6 ;.cosx
     mulps     xmm1,xmm5 ;dword .siny
     mulps     xmm1,xmm3 ;.sinz         -> cx*sy*sz in lane 0
     movaps    xmm0,xmm7 ;dword .sinx
     mulps     xmm0,xmm2 ;.cosz         -> sx*cz in lane 0
     subps     xmm1,xmm0 ; lane 0 = cx*sy*sz - sx*cz  (element 7)
     mulps     xmm2,xmm6
     mulps     xmm2,xmm5 ; lane 0 = cz*cx*sy
     mulps     xmm3,xmm7 ; lane 0 = sz*sx
     addps     xmm2,xmm3 ; lane 0 = cx*sy*cz + sx*sz  (element 6)
     mulps     xmm7,xmm4 ; sx*cy                      (element 5)
     movlhps   xmm7,xmm2 ; sx*cy, sx*cy, element 6, -
     mulps     xmm6,xmm4 ; lane 0 = cx*cy             (element 8)
     movlhps   xmm6,xmm1 ; cx*cy, cx*cy, element 7, -
     shufps    xmm7,xmm6,00101000b ; element 5, 6, 7, 8
     movups    [edi+20],xmm7
ret
;=============================================
reverse_mx_3x3:
; Inverts a 3x3 float matrix: every element of the adjugate is multiplied
; by 1/det, where 1/det comes from rcpps (an approximate reciprocal).
; esi - source matrix
; edi - desired reversed matrix
; changes: xmm0 - xmm7
; Notes:
;  - the numbers in the comments below are byte offsets from esi
;    (0 = first element ... 32 = ninth element)
;  - the loads from [esi+24] fetch 16 bytes, i.e. 4 bytes past the end of a
;    36-byte matrix; that extra lane never reaches the stored result
;  - [edi] is written before the last loads from [esi], so source and
;    destination must not overlap
;  - zero_hgst is defined elsewhere; presumably a mask that clears the
;    highest dword of a register
; Older transpose-only version, kept for reference:
;  push  ebp
;  mov   ebp,esp
;  push   esi edi
;  cld
;  movsd   ; * - *
;  push    edi
;  add     edi,8                           ;  *ab
;                                          ;  c*d
;  movsd   ; a - c                         ;  ef*
;  add     edi,8
;  movsd   ; b - e
;  pop     edi
;  movsd   ; c - a
;  add     edi,8          ; if mx is orthogonal
;  movsd   ; * - *        ; just transpose
;  add     edi,8
;  movsd   ; d - f
;  sub     edi,24
;  movsd   ; e - b
;  add     edi,8
;  movsd   ; f - d
;  add     edi,8
;  movsd   ; * - *
;  pop     edi esi
;ret
  ; --- determinant: sum of the three "positive" triple products ...
  movups  xmm0,[esi]
  movups  xmm1,[esi+12]
  movaps  xmm5,xmm1
  movups  xmm2,[esi+20]
  shufps  xmm0,xmm0,11011000b    ; 0 8 4
  movaps  xmm7,xmm2
  shufps  xmm1,xmm1,11110001b    ; 16 12 24
  shufps  xmm2,xmm2,11001011b    ; 32 28 20
  movaps  xmm4,xmm0
  mulps   xmm0,xmm1
  mulps   xmm0,xmm2              ; 0*16*32, 8*12*28, 4*24*20
  andps   xmm0,[zero_hgst]       ; drop the unused fourth lane
  haddps  xmm0,xmm0
  haddps  xmm0,xmm0              ; sum in every lane
  ; --- ... minus the three "negative" triple products
  movups  xmm6,[esi+24]
  shufps  xmm5,xmm5,11000110b    ; 20 16 12
  shufps  xmm6,xmm6,11100001b    ; 28 24 32
  mulps   xmm4,xmm5
  mulps   xmm4,xmm6              ; 0*20*28, 8*16*24, 4*12*32
  movhlps xmm3,xmm4              ; third product down to lane 0
  haddps  xmm4,xmm4              ; lane 0 = first + second product
  addps   xmm4,xmm3              ; lane 0 = sum of all three
  subps   xmm0,xmm4              ; lane 0 = determinant
  rcpps   xmm0,xmm0              ; approximate 1/det
  shufps  xmm0,xmm0,0            ; 1/det in every lane
  ; --- result elements 0..3: pairs of products, hsubps forms the differences
  movups  xmm4,[esi+4]
  movups  xmm1,[esi+16]          ; x1 - 16 20 24 28
  movhps  xmm1,[esi+28]          ; x1 - 16 20 28 32
  movaps  xmm5,xmm1
  movhlps xmm2,xmm1              ; x2 - 28 32
  movlhps xmm2,xmm4              ; x2 - 28 32 4 8
  shufps  xmm2,xmm2,10110001b    ; x2 - 32 28 8 4
  mulps   xmm2,xmm1
  hsubps  xmm2,xmm2              ; 16*32 - 20*28,  8*28 - 4*32
  shufps  xmm5,xmm5,11110001b
  movups  xmm6,[esi+12]
  movaps  xmm3,xmm6
  shufps  xmm6,xmm6,11110011b    ; 24 12
  shufps  xmm7,xmm7,11111100b    ; 20 32
  movlhps xmm4,xmm6
  movlhps xmm5,xmm7
  mulps   xmm4,xmm5
  hsubps  xmm4,xmm4              ; 4*20 - 8*16,  24*20 - 12*32
  movlhps xmm2,xmm4
  mulps   xmm2,xmm0              ; times 1/det
  movups  [edi],xmm2
  ; --- result elements 4..7
  movups  xmm4,[esi]
  movups  xmm5,[esi+24]
  movaps  xmm1,xmm3
  movaps  xmm6,xmm4
  movlhps xmm1,xmm4
  movaps  xmm2,xmm5
  movaps  xmm7,xmm3
  shufps  xmm4,xmm4,00101000b  ; 0  8   8  0
  shufps  xmm5,xmm5,10000010b  ; 32 24  24 32
  shufps  xmm3,xmm3,11111000b  ; 12 20
  movlhps xmm5,xmm3
  mulps   xmm4,xmm5
  hsubps  xmm4,xmm4            ; 0*32 - 8*24,  8*12 - 0*20
  shufps  xmm1,xmm1,10110100b  ; 12 16  4 0
  shufps  xmm2,xmm2,01000001b  ; 28 24 24 28
  mulps   xmm1,xmm2
  hsubps  xmm1,xmm1            ; 12*28 - 16*24,  4*24 - 0*28
  movlhps xmm4,xmm1
  mulps   xmm4,xmm0            ; times 1/det
  movups  [edi+16],xmm4
  ; --- result element 8
  shufps  xmm7,xmm7,11110001b
  mulps   xmm7,xmm6
  hsubps  xmm7,xmm7            ; 0*16 - 4*12
  mulps   xmm7,xmm0
  movss   [edi+32],xmm7
;  mov  esp,ebp
;  pop  ebp
ret
;================================
cross_reg:
; Cross product of two 3-component vectors held in registers.
; in:      xmm0 = a  (l->h  ax, ay, az, -)
;          xmm1 = b  (l->h  bx, by, bz, -)
; out:     xmm0 = a x b in lanes 0..2, lane 3 = ax*bx - ax*bx
; changes: xmm1, xmm2, xmm3
      movaps  xmm3,xmm1
      movaps  xmm2,xmm0
      shufps  xmm3,xmm3,00001001b   ; by, bz, bx, bx
      shufps  xmm1,xmm1,00010010b   ; bz, bx, by, bx
      shufps  xmm2,xmm2,00010010b   ; az, ax, ay, ax
      shufps  xmm0,xmm0,00001001b   ; ay, az, ax, ax
      mulps   xmm2,xmm3             ; az*by, ax*bz, ay*bx
      mulps   xmm0,xmm1             ; ay*bz, az*bx, ax*by
      subps   xmm0,xmm2             ; difference of the two = cross product
ret
;--------------------------------------------------------------
; Scales a 3-float vector in memory to unit length, in place.
; in:      edi - pointer to vector (3 floats; 16 bytes are read)
; out:     [edi] - normalized vector (12 bytes written)
; changes: xmm0, xmm1
; The inverse length comes from rsqrtps, i.e. it is approximate.
; zero_hgst is defined elsewhere; presumably a mask that clears
; the fourth dword so it does not take part in the length.
;--------------------------------------------------------------
normalize_vector:
  movups  xmm1,[edi]
  andps   xmm1,[zero_hgst]    ; x, y, z, masked lane
  movaps  xmm0,xmm1
  mulps   xmm0,xmm0           ; squares
  haddps  xmm0,xmm0
  haddps  xmm0,xmm0           ; x^2 + y^2 + z^2 in every lane
  rsqrtps xmm0,xmm0           ; approximate 1 / length
  mulps   xmm1,xmm0           ; scaled vector
  movlps  [edi],xmm1          ; store x, y
  movhlps xmm0,xmm1           ; bring z down to lane 0
  movss   [edi+8],xmm0        ; store z
ret
;norm_reg:
; in, out: xmm0 = vect to norm
; changes: xm1
;  andps   xmm0,[zero_hgst]
;  movaps  xmm1,xmm0
;  mulps   xmm0,xmm0
;  haddps  xmm0,xmm0
;  haddps  xmm0,xmm0
;  rsqrtps xmm0,xmm0
;  mulps   xmm0,xmm1
;------------------in: -------------------------
;------------------ esi - pointer to 1st vector (3 floats, 16 bytes read)
;------------------ edi - pointer to 2nd vector (3 floats, 16 bytes read)
;------------------out:
;------------------ xmm0 - broadcasted dot-product
;------------------changes:
;------------------ xmm1 - holds the 16 bytes loaded from [edi]
; zero_hgst is defined elsewhere; presumably a mask that clears the
; highest dword of a register.
dot_product:
        movups  xmm0,[esi]
        movups  xmm1,[edi]
        mulps   xmm0,xmm1
        ; Mask the products, not one of the inputs: the fourth dword of
        ; either load is whatever follows the vector in memory, and
        ; 0 * Inf or 0 * NaN would turn the whole sum into NaN.
        andps   xmm0,[zero_hgst]
        haddps  xmm0,xmm0
        haddps  xmm0,xmm0       ; x1*x2 + y1*y2 + z1*z2 in every lane
ret
rotary:
;   Copyright (C) 1999-2001  Brian Paul
;---------------------
; Multiplies every 3-float point (normal) by a 3x3 matrix:
;   out = x * [ebx] + y * [ebx+12] + z * [ebx+24]
;  in:  esi - ptr to points (normals), each coefficient is a dword float
;       edi - ptr to rotated points (normals)
;       ebx - ptr to 3x3 (9 dwords, 36 bytes) rotation matrix
;       ecx - number of points (normals); 0 returns immediately
;  out: esi, edi advanced by 12 bytes per point, ecx = 0
;  changes: xmm0 - xmm2, xmm4 - xmm6
    jecxz    .rot_done
    movups   xmm6,[ebx+24]          ; third  group of matrix floats
    movups   xmm5,[ebx+12]          ; second group
    movups   xmm4,[ebx]             ; first  group
  .rot_next:
    movlps   xmm2,[esi+4]           ; y, z
    movlps   xmm0,[esi]             ; x, y   (movlps - shorter than movss)
    shufps   xmm2,xmm2,01010101b    ; z in every lane
    movaps   xmm1,xmm0
    mulps    xmm2,xmm6
    shufps   xmm1,xmm1,01010101b    ; y in every lane
    shufps   xmm0,xmm0,0            ; x in every lane
    mulps    xmm1,xmm5
    mulps    xmm0,xmm4
    addps    xmm0,xmm1
    addps    xmm0,xmm2
    movlps   [edi],xmm0             ; first two result components
    movhlps  xmm1,xmm0
    movss    [edi+8],xmm1           ; third result component
    add      esi,12
    add      edi,12
    loop     .rot_next
  .rot_done:
ret
mul_matrix4x4:
; Transforms 3-float points by a 4x4 matrix, treating each point as
; (x, y, z, 1):  out = x*[ebx] + y*[ebx+16] + z*[ebx+32] + [ebx+48];
; only the first three components of the result are stored.
;  in:  esi - ptr to points (12 bytes each)
;       edi - ptr to multiplied points (12 bytes each)
;       ebx - ptr to 4x4 (16 dwords, 64 bytes) matrix
;       ecx - number of points (vertices); 0 returns immediately
;  out: esi, edi advanced by 12 bytes per point, ecx = 0
;  changes: xmm0 - xmm2, xmm4 - xmm7 (nothing is touched when ecx = 0)
    jecxz    .aend1          ; without this guard "loop" would wrap ecx
                             ; around and run 2^32 times for ecx = 0
    movups   xmm4,[ebx]
    movups   xmm5,[ebx+16]
    movups   xmm6,[ebx+32]
    movups   xmm7,[ebx+48]
  .aagain1:
    movlps   xmm0,[esi]      ; x, y
    movlps   xmm2,[esi+4]    ; y, z
    movaps   xmm1,xmm0
    shufps   xmm0,xmm0,0             ; x in every lane
    shufps   xmm1,xmm1,01010101b     ; y in every lane
    shufps   xmm2,xmm2,01010101b     ; z in every lane
    mulps    xmm0,xmm4
    mulps    xmm1,xmm5
    mulps    xmm2,xmm6
    addps    xmm0,xmm1
    addps    xmm0,xmm2
    addps    xmm0,xmm7       ; + fourth group (w = 1)
    movlps   [edi],xmm0
    movhlps  xmm0,xmm0
    movss    [edi+8],xmm0
    add      esi,12
    add      edi,12
    loop      .aagain1
  .aend1:
ret
;----------------------------------------------
; Multiplies all nine elements of a 3x3 matrix by one scalar, in place.
;  esi - pointer to 3x3 matrix (9 floats, 36 bytes)
;  ebx - ptr to scale (one float)
;  changes: xmm0 - xmm3
add_scale_to_matrix:
     movss   xmm3,[ebx]
     movups  xmm0,[esi]          ; elements 0..3
     shufps  xmm3,xmm3,0         ; scale in every lane
     movups  xmm1,[esi+16]       ; elements 4..7
     movss   xmm2,[esi+32]       ; element 8 only: an 8-byte movlps here
                                 ; would also scale and rewrite the dword
                                 ; that follows the 36-byte matrix
     mulps   xmm0,xmm3
     mulps   xmm1,xmm3
     mulps   xmm2,xmm3
     movups  [esi],xmm0
     movups  [esi+16],xmm1
     movss   [esi+32],xmm2
ret